In [21]:
#!pip install geopandas
In [1]:
import pandas as pd
import geopandas as gpd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from tqdm import tqdm
from textblob import Blobber
import os
import plotly.express as px
import json
import re
In [2]:
def import_data(listings_file, reviews_file, neighborhood_geojson):
    '''Extracts data from file source path and returns pandas dataframes and geopandas dataframe'''
    return pd.read_csv(listings_file), pd.read_csv(reviews_file), gpd.read_file(neighborhood_geojson)
In [3]:
def data_clean():
    '''Cleans data to remove unnecessary punctuation, fixing the datatypes of columns'''
    global neighborhood_geo_data, listings_data, reviews_data
    listings_data['price'] = listings_data['price'].str[1:]
    listings_data['price'] = listings_data['price'].apply(lambda x: x.replace(',','') if "," in x else x)
    listings_data['price'] = listings_data['price'].astype("float")
    
    reviews_data['reviewer_id'] = reviews_data['reviewer_id'].astype("str")
    
    neighborhood_geo_data = neighborhood_geo_data.drop('neighbourhood_group', axis=1)
In [4]:
def plot_corr_price():
    '''Function to plot correlation of multiple columns listed below'''
    corr = listings_data[['price','minimum_nights','accommodates', 'beds', 'bedrooms','host_total_listings_count','number_of_reviews']].corr()
    # mask = np.triu(np.ones_like(corr, dtype=bool))
    cmap = sns.diverging_palette(230, 20, as_cmap=True)
    sns.heatmap(corr,  cmap=cmap, vmax=.3, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5})
    plt.show()
In [5]:
def sentiment_list(nb_analyzer=False):
    '''
    Creates a setiment analysis list for reviews of different houses. Uses NaiveBayes for analysis
    if flag is set, if not uses TextBlob to return sentiment subjectivity of the review
    '''
    NB_list = []
    if nb_analyzer:
        tb = Blobber(analyzer=NaiveBayesAnalyzer())
        for i in tqdm(reviews_data['comments']):
            NB_list.append(tb(str(i)).sentiment.p_pos if i!=np.nan else 0.5)
    else:
        for i in tqdm(reviews_data['comments']):
            NB_list.append(TextBlob(str(i)).sentiment.subjectivity if i!=np.nan else 0.5)
    return NB_list
In [6]:
def best_worst_listings(data):
    '''
    Creates a plotly plot for the best and worst listings
    '''
    data = data.groupby(by="listing_id").mean()['nb_opinion_bins'].sort_values(ascending=False)

    limit=5
    fig = make_subplots(rows=2, cols=1, subplot_titles=("Best Customer Satisfaction listings", "Worst Customer Satisfaction listings"))
    fig.add_trace( go.Bar(y = data.values[:limit], x = data.index.astype('str')[:limit]),  row=1, col=1)
    fig.add_trace( go.Bar(y = data.values[-limit:], x = data.index.astype('str')[-limit:]), row=2, col=1)
    return fig
In [7]:
def get_top_reviewers():
    '''
    Calculate top reviewers based on the number of reviews and returns dataframe with the data in desc order
    '''
    top_reviewer_details = reviews_data.groupby(by="reviewer_id").count()["id"].reset_index().sort_values(by="id", ascending = False)
    top_reviewer_details = pd.merge(top_reviewer_details, reviews_data, how="inner", left_on="reviewer_id", right_on="reviewer_id")[['reviewer_id','reviewer_name','id_x']].drop_duplicates()
    top_reviewer_details["count"] = top_reviewer_details["id_x"].values
    top_reviewer_details["id_and_name"] = top_reviewer_details['reviewer_id'].astype("str") +" "+ top_reviewer_details['reviewer_name']
    return top_reviewer_details
In [8]:
def get_sentiment_values(reviews_data):
    '''
    Calculate sentiment if sentiment_values.csv file does not exist. We do this to decrease the time of execution in subsequent runs
    '''
    if os.path.isfile("../Data/sentiment_values.csv"):
        temp_df = pd.read_csv("../Data/sentiment_values.csv")
        reviews_data["pattern_opinion"] = temp_df["pattern_opinion"]
        reviews_data["nb_opinion"] = temp_df["nb_opinion"]
    else:
        reviews_data["pattern_opinion"] = sentiment_list()
        reviews_data["nb_opinion"] = sentiment_list(nb_analyzer=True)
        reviews_data[["pattern_opinion", "nb_opinion"]].to_csv('../Data/sentiment_values.csv',index=False)
    return reviews_data
In [9]:
def plot_top_reviewers_and_opinions(limit=30):
    '''
    Plot top reviewers and their average POS opinions for both NB and pattern based sentiment analysis 
    '''

    fig = make_subplots(rows=3, cols=1, subplot_titles=("Best reviewers", "Average pos opinion based on bayesian sentiment analysis", "Average pos opinion based on pattern based sentiment analysis"),)


    review_opinion_df = pd.merge(get_top_reviewers().iloc[:limit,:],
             listing_count_reviews_data[["reviewer_id", "pattern_opinion", "nb_opinion"]].groupby(by="reviewer_id").mean().reset_index(), 
             left_on="reviewer_id", right_on="reviewer_id")

    fig.add_trace( go.Bar(y=review_opinion_df["count"][:limit], x = review_opinion_df["id_and_name"][:limit]), row=1, col=1)

    fig.add_trace( go.Bar(y = review_opinion_df["nb_opinion"].values[:limit], x = review_opinion_df["id_and_name"].values[:limit]), row=2, col=1)

    fig.add_trace( go.Bar(y = review_opinion_df["pattern_opinion"].values[:limit], x = review_opinion_df["id_and_name"].values[:limit]), row=3, col=1)
    fig.update_layout(height=700, title = "Reviewers and their opinions",xaxis=dict(tickvals=[]), xaxis2=dict(tickvals=[]), xaxis3=dict(tickvals=[]))

    return fig
In [10]:
def plot_busiest_months():
    '''
    Which months have the highest and lowest bookings
    '''
    count_by_months = reviews_data['date'].str[5:7].value_counts()

    fig = go.Figure(
        data=[go.Bar(y = count_by_months.values, x = count_by_months.index)],
        layout_title_text="Busiest Months"
    )
    return fig
In [11]:
listings_data, reviews_data, neighborhood_geo_data = import_data("../Data/listings.csv", "../Data/reviews.csv",  "../Data/neighbourhoods.geojson")
data_clean()
In [12]:
reviews_data = get_sentiment_values(reviews_data)
count_merge_data = pd.merge(reviews_data, get_top_reviewers(), left_on='reviewer_id', right_on='reviewer_id')
count_merge_data["count_bins"]=pd.cut(count_merge_data['count'], bins = 3 , labels = [0.3,0.5,1])
count_merge_data["count_bins"] = count_merge_data["count_bins"].astype("float")
# reviews_data.head()
In [13]:
count_merge_data["nb_opinion_bins"] = count_merge_data["nb_opinion"]*count_merge_data["count_bins"]
count_merge_data["pattern_opinion_bins"] = count_merge_data["pattern_opinion"]*count_merge_data["count_bins"]
In [14]:
listing_count_reviews_data = pd.merge(count_merge_data.drop("id", axis=1), listings_data[["id","number_of_reviews"]], right_on="id", left_on="listing_id")
# listing_count_reviews_data["number_of_reviews"].unique()
listing_count_reviews_data["review_bins"]=pd.cut(listing_count_reviews_data['number_of_reviews'], bins = 3 , labels = [0.3,0.5,1])
listing_count_reviews_data["review_bins"] = listing_count_reviews_data["review_bins"].astype("float")
# listing_count_reviews_data.head()

Which month or months are the busiest?¶

Assuming that the reviews are written in the few days of staying in the BNB.¶

In [16]:
plot_busiest_months()

How is price correlated with the factors?¶

We check with factors like number of bedrooms, minimum nights, number of beds, how many people it accomodates, number of ratings and total number of listings of the property?¶

In [17]:
plot_corr_price()

Who has the highest reviews and what are their average opinion level?¶

Since the ticks are long and overlapping I removed them. We can check the names and values by hovering on the bars. All three match the reviewers in the exact same order.¶

In [18]:
plot_top_reviewers_and_opinions()

What are the best and worst listings?¶

"Update equation"¶

In [19]:
best_worst_listings(listing_count_reviews_data)

K-Means to cluster neighborhoods¶

In [23]:
listings_with_mean_reviews.columns
Out[23]:
Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'description',
       'neighborhood_overview', 'picture_url', 'host_id', 'host_url',
       'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'calendar_updated', 'has_availability',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'calendar_last_scraped', 'number_of_reviews',
       'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review',
       'last_review', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'license', 'instant_bookable',
       'calculated_host_listings_count',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms', 'reviews_per_month',
       'listing_id', 'nb_opinion', 'pattern_opinion'],
      dtype='object')
In [22]:
listings_with_mean_reviews = pd.merge(listings_data, listing_count_reviews_data.groupby(by="listing_id").mean()[["nb_opinion","pattern_opinion"]].reset_index(), left_on="id", right_on="listing_id")
# listings_with_mean_reviews.columns
In [24]:
def create_amenities_list(x):
    temp_lst = [0]*5
    if x==5:
        return [1,1,1,1,1]
    else:
        if ' washer' in x: temp_lst[0]=1
        if 'alarm' in x: temp_lst[1]=1
        if 'conditioning' in x: temp_lst[2]=1
        if 'essentials' in x: temp_lst[3]=1
        if 'wifi' in x: temp_lst[4]=1
    return temp_lst
In [27]:
listings_with_mean_reviews[['washer', 'alarm', 'conditioning', 'essentials', "wifi"]] = listings_with_mean_reviews["amenities"].str.replace("\[|\]|\"","").str.lower().apply(lambda x: sorted(list(set(re.findall(" washer|wifi|alarm|conditioning|essentials", x))))).apply(lambda x: create_amenities_list(x)).apply(pd.Series)
# listings_with_mean_reviews.head()
/tmp/ipykernel_4495/4286936106.py:1: FutureWarning:

The default value of regex will change from True to False in a future version.

In [ ]:
# df = listings_with_mean_reviews["amenities"].str.replace("\[|\]|\"","").str.split(",").explode().str.strip().str.lower().reset_index().drop('index',axis=1)['amenities'].str.split(" ").explode().value_counts().head(50)
# df
In [28]:
def modify_property_type(x):
    '''
    TODO
    '''
    if "apartment" in x:return 'apartment'
    if "room" in x:return 'room'
    if "entire bed" in x: return 'room'
    if 'entire place' in x: return 'house'
    if "condominium" in x:return 'condominium'
    if "suite" in x:return 'suite'
    if "loft" in x:return 'loft'
    if "casa" in x:return 'house'
    if "house" in x:return 'house'
    if 'boat' in x:return 'boat'
    if 'castle' in x:return 'house'
    else:
        return "None"
In [29]:
listings_with_mean_reviews["property_type"] = listings_with_mean_reviews["property_type"].apply(lambda x: modify_property_type(x.lower()))
In [30]:
listings_with_mean_reviews = listings_with_mean_reviews.replace(np.nan, None)
listings_with_mean_reviews = listings_with_mean_reviews.fillna({'review_scores_location':listings_with_mean_reviews[["review_scores_location"]].mean()})
In [31]:
from sklearn.cluster import KMeans
kmeans = KMeans(algorithm='elkan').fit(listings_with_mean_reviews[["availability_365", "reviews_per_month", "nb_opinion", 'review_scores_location', "price","washer","conditioning","essentials","alarm","wifi"]])
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [31], in <cell line: 2>()
      1 from sklearn.cluster import KMeans
----> 2 kmeans = KMeans(algorithm='elkan').fit(listings_with_mean_reviews[["availability_365", "reviews_per_month", "nb_opinion", 'review_scores_location', "price","washer","conditioning","essentials","alarm","wifi"]])

File ~/anaconda3/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:1137, in KMeans.fit(self, X, y, sample_weight)
   1111 def fit(self, X, y=None, sample_weight=None):
   1112     """Compute k-means clustering.
   1113 
   1114     Parameters
   (...)
   1135         Fitted estimator.
   1136     """
-> 1137     X = self._validate_data(
   1138         X,
   1139         accept_sparse="csr",
   1140         dtype=[np.float64, np.float32],
   1141         order="C",
   1142         copy=self.copy_x,
   1143         accept_large_sparse=False,
   1144     )
   1146     self._check_params(X)
   1147     random_state = check_random_state(self.random_state)

File ~/anaconda3/lib/python3.9/site-packages/sklearn/base.py:566, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
    564     raise ValueError("Validation should be done on X, y or both.")
    565 elif not no_val_X and no_val_y:
--> 566     X = check_array(X, **check_params)
    567     out = X
    568 elif no_val_X and not no_val_y:

File ~/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py:800, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
    794         raise ValueError(
    795             "Found array with dim %d. %s expected <= 2."
    796             % (array.ndim, estimator_name)
    797         )
    799     if force_all_finite:
--> 800         _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan")
    802 if ensure_min_samples > 0:
    803     n_samples = _num_samples(array)

File ~/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py:114, in _assert_all_finite(X, allow_nan, msg_dtype)
    107     if (
    108         allow_nan
    109         and np.isinf(X).any()
    110         or not allow_nan
    111         and not np.isfinite(X).all()
    112     ):
    113         type_err = "infinity" if allow_nan else "NaN, infinity"
--> 114         raise ValueError(
    115             msg_err.format(
    116                 type_err, msg_dtype if msg_dtype is not None else X.dtype
    117             )
    118         )
    119 # for object dtype data, we only check for NaNs (GH-13254)
    120 elif X.dtype == np.dtype("object") and not allow_nan:

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
In [ ]:
kmeans.get_params()
In [ ]:
import json
In [ ]:
neighborhood_json = json.load(open("../Data/neighbourhoods.geojson"))
In [ ]:
df = listings_data.groupby(by="neighbourhood_cleansed").mean()['price'].reset_index()
center = {'lat':neighborhood_json['features'][0]['geometry']['coordinates'][0][0][0][1],'lon':neighborhood_json['features'][0]['geometry']['coordinates'][0][0][0][0]}
fig = px.choropleth_mapbox(df, geojson=neighborhood_json, locations='neighbourhood_cleansed', color='price',
                            featureidkey="properties.neighbourhood",
                           mapbox_style="open-street-map",
                           zoom=9.5,
                           opacity=0.5,
                           center=center,
                           range_color=(df['price'].min(), df['price'].max()))
fig.update_geos(fitbounds="locations", visible=False)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: